{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%cd /data/ajay/contracode\n",
    "%pwd\n",
    "import pandas as pd\n",
    "import plotnine as p9\n",
    "%matplotlib inline\n",
    "import jsonlines\n",
    "import random\n",
    "from pprint import pprint\n",
    "from tqdm.auto import tqdm\n",
    "import pickle\n",
    "from collections import defaultdict, Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_data(fname):\n",
    "    with jsonlines.open(fname) as reader:\n",
    "        return list(reader)\n",
    "data = load_data('data/codesearchnet_javascript/javascript_train_supervised.jsonl')\n",
    "\n",
    "def extract_fields(fields, gen):\n",
    "    for item in gen:\n",
    "        yield \n",
    "\n",
    "def parse_data(key, data):\n",
    "    matches = filter(lambda x: key.lower() == x['func_name'].lower(), data)\n",
    "    extracted_matches = map(lambda item: {k: v for k, v in item.items() if k in ['func_name', 'code']}, matches)\n",
    "    return list(extracted_matches)\n",
    "\n",
    "matches = defaultdict(list)\n",
    "for x in tqdm(data):\n",
    "    matches[x['func_name'].lower()].append(x['code'])\n",
    "\n",
    "match_counts = Counter()\n",
    "for k, v in matches.items():\n",
    "    match_counts[k] += len(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "match_counts.most_common(25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time dataset_indexes = list(enumerate(x[1] for x in match_counts.most_common(20000)))\n",
    "%time df = pd.DataFrame(dataset_indexes, columns=['Method', 'Frequency'])\n",
    "(p9.ggplot(df, p9.aes(x=\"Method\", y=\"Frequency\")) + p9.geom_bar(stat = \"identity\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_methods = set(x[0] for x in match_counts.most_common(500))\n",
    "df_data = []\n",
    "for method in tqdm(top_methods):\n",
    "    df_data.extend(parse_data(method, data))\n",
    "grouped_methods_df = pd.DataFrame(df_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped_methods_df.to_pickle('data/tsne/supervised_top500_groups_df.pickle') \n",
    "grouped_methods_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pack augmented dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/dev/shm/javascript_augmented.pickle', 'rb') as f:\n",
    "    dataset = pickle.loads(f.read())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import modin.pandas as pd\n",
    "df = pd.DataFrame(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import random\n",
    "shuffled_list = list(range(len(dataset)))\n",
    "random.shuffle(shuffled_list)\n",
    "sample_10k = [dataset[i] for i in shuffled_list[:10000]]\n",
    "sample_df = pd.DataFrame([dict(text=list(x)) for x in sample_10k], columns=['text'])\n",
    "sample_df.to_pickle('/data/ajay/contracode/data/codesearchnet_javascript/augmented_10k_subset.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.read_pickle('/data/ajay/contracode/data/codesearchnet_javascript/augmented_10k_subset.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "moco_path = \"data/tsne/moco_embed.pickle\"\n",
    "!ls -lah {moco_path}\n",
    "with open(moco_path, 'rb') as f:\n",
    "    x = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pickle.loads(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
